R will run in parallel. A number of packages help with this. Some require “forking” which Windows will not support.
p_load(parallel)
p_load(MASS)
starts <- rep(100,4)
fx <- function(nstart) kmeans(Boston, 4, nstart=nstart)
numCores <- detectCores()
numCores
## [1] 4
microbenchmark(lapply(starts, fx))
## Unit: milliseconds
## expr min lq mean median uq max
## lapply(starts, fx) 128.7929 131.4228 135.5263 133.6529 136.6032 160.8576
## neval
## 100
microbenchmark(lapply(starts, fx), mclapply(starts, fx, mc.cores=numCores))
## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows
microbenchmark(results1 <- lapply(starts, fx),
results2 <- mclapply(starts, fx, mc.cores=numCores)
)
## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows
microbenchmark(lapply(starts, fx), mclapply(starts, fx))
## Unit: milliseconds
## expr min lq mean median uq
## lapply(starts, fx) 128.4495 132.5929 138.4325 135.1849 139.1556
## mclapply(starts, fx) 128.2621 132.5228 137.3198 134.5668 138.8637
## max neval cld
## 250.1233 100 a
## 229.9955 100 a
microbenchmark(results1 <- lapply(starts, fx),
results2 <- mclapply(starts, fx)
)
## Unit: milliseconds
## expr min lq mean median
## results1 <- lapply(starts, fx) 128.1570 132.1163 136.8817 134.6078
## results2 <- mclapply(starts, fx) 127.7875 132.7025 137.8772 134.9077
## uq max neval cld
## 139.2817 178.2303 100 a
## 138.3704 232.4874 100 a
We can use the doParallel package with the foreach package to improve looping.
First some setup.
p_load(foreach)
p_load(doParallel)
registerDoParallel(2)
getDoParWorkers()
## [1] 2
getDoParName()
## [1] "doParallelSNOW"
getDoParVersion()
## [1] "1.0.14"
Now carry out some simple calculations to show how to use parallel processing.
microbenchmark(
for (i in 1:10){
sqrt(i)
},
foreach(i=1:10) %do% {
sqrt(i)
},
foreach (i=1:10) %dopar% {
sqrt(i)
},
foreach (i=1:10, .combine=c) %dopar% {
sqrt(i)
}
)
## Unit: milliseconds
## expr min lq
## for (i in 1:10) { sqrt(i) } 1.7048 1.90695
## foreach(i = 1:10) %do% { sqrt(i) } 4.8721 5.28460
## foreach(i = 1:10) %dopar% { sqrt(i) } 9.3667 10.02070
## foreach(i = 1:10, .combine = c) %dopar% { sqrt(i) } 9.2575 9.93580
## mean median uq max neval cld
## 2.074626 2.00415 2.15235 3.7961 100 a
## 5.775400 5.53715 5.83250 11.6193 100 b
## 10.713187 10.36680 10.81185 19.9433 100 c
## 11.099086 10.21875 10.75500 50.4359 100 c
Note that the use of parallel processing in this case is not helping.
Try some bootstrapping to check performance on a “real” problem.
x <- iris[which(iris[,5] != "setosa"), c(1,5)]
trials <- 10000
microbenchmark(
### Parallel (%dopar%)
bs1 <- foreach(icount(trials), .combine=cbind) %dopar% {
ind <- sample(100, 100, replace=TRUE)
result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
coefficients(result1)
},
### Not parallel (%do%)
bs2 <- foreach(icount(trials), .combine=cbind) %do% {
ind <- sample(100, 100, replace=TRUE)
result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
coefficients(result1)
}
)
## Unit: seconds
## expr
## bs1 <- foreach(icount(trials), .combine = cbind) %dopar% { ind <- sample(100, 100, replace = TRUE) result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) coefficients(result1) }
## bs2 <- foreach(icount(trials), .combine = cbind) %do% { ind <- sample(100, 100, replace = TRUE) result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) coefficients(result1) }
## min lq mean median uq max neval cld
## 20.62518 25.93794 28.02471 28.67176 30.53672 36.54693 100 a
## 28.26366 36.22772 38.81467 38.58027 42.66944 50.61312 100 b
### Look at the bootstrap since we ran it
bs1 <- t(bs1)
bs2 <- t(bs2)
apply(bs1,2,mean)
## (Intercept) x[ind, 1]
## -13.112649 2.099639
apply(bs1,2,var)
## (Intercept) x[ind, 1]
## 9.5801903 0.2450442
apply(bs1,2,hist)
## $`(Intercept)`
## $breaks
## [1] -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10 -8 -6 -4 -2
##
## $counts
## [1] 1 0 1 8 13 53 166 426 988 1911 2527 2423 1185 272
## [15] 24 2
##
## $density
## [1] 0.00005 0.00000 0.00005 0.00040 0.00065 0.00265 0.00830 0.02130
## [9] 0.04940 0.09555 0.12635 0.12115 0.05925 0.01360 0.00120 0.00010
##
## $mids
## [1] -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7 -5 -3
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $`x[ind, 1]`
## $breaks
## [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
##
## $counts
## [1] 1 28 950 3601 3466 1470 393 77 12 1 1
##
## $density
## [1] 0.0002 0.0056 0.1900 0.7202 0.6932 0.2940 0.0786 0.0154 0.0024 0.0002
## [11] 0.0002
##
## $mids
## [1] 0.25 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
apply(bs2,2,mean)
## (Intercept) x[ind, 1]
## -13.172504 2.110227
apply(bs2,2,var)
## (Intercept) x[ind, 1]
## 10.212764 0.261791
apply(bs2,2,hist)
## $`(Intercept)`
## $breaks
## [1] -40 -38 -36 -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10 -8
## [18] -6 -4 -2
##
## $counts
## [1] 1 0 0 0 0 2 12 19 73 195 466 942 1855 2581
## [15] 2375 1166 278 34 1
##
## $density
## [1] 0.00005 0.00000 0.00000 0.00000 0.00000 0.00010 0.00060 0.00095
## [9] 0.00365 0.00975 0.02330 0.04710 0.09275 0.12905 0.11875 0.05830
## [17] 0.01390 0.00170 0.00005
##
## $mids
## [1] -39 -37 -35 -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11 -9 -7
## [18] -5 -3
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $`x[ind, 1]`
## $breaks
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5
##
## $counts
## [1] 43 906 3567 3483 1454 430 92 22 2 0 0 1
##
## $density
## [1] 0.0086 0.1812 0.7134 0.6966 0.2908 0.0860 0.0184 0.0044 0.0004 0.0000
## [11] 0.0000 0.0002
##
## $mids
## [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"